The purpose of this workflow is to associate gene module expression (EOS, PMN) with host outcomes including:
Load packages
# Data manipulation and figures
library(tidyverse)
library(readxl)
library(cowplot)
library(tidytext)
# RNAseq data
library(limma)
#Correlation
library(corrplot)
#PLS
library(mixOmics)
# Print tty table to knit file
library(knitr)
library(kableExtra)
options(knitr.kable.NA = '')
`%notin%` <- Negate(`%in%`)
Set seed
set.seed(4389)
Load functions
source("scripts/corr.fxn.R")
load("data_clean/P337_BAL_data.RData")
When applicable, correlations will be completed for delta values (V5-V4) to capture the paired nature of these data.
Two donors are removed from BE expression data (MA1001 missing V4, MA1086 missing V5).
#Read in all module expression
mod.files <- list.files(path="results/module_level/", pattern="mod_voom_counts",
recursive = TRUE, full.names = TRUE)
counts.mod <- data.frame()
for(file in mod.files){
counts.temp <- read_csv(file) %>%
pivot_longer(-c(module,module.char), names_to="libID")
counts.mod <- rbind(counts.mod, counts.temp)
}
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## module = col_character(),
## module.char = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
##
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## module = col_character(),
## module.char = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
#Calculate delta
counts.mod.delta <- counts.mod %>%
#Remove modules 00
filter(!grepl("_00",module)) %>%
dplyr::select(-module.char) %>%
#Add metadata to denote V4, V5
full_join(dplyr::select(dat.BAL.abund.norm.voom$targets,
libID, donorID, visit), by = "libID") %>%
#separate V4 and V5
dplyr::select(-libID) %>%
pivot_wider(names_from = visit) %>%
mutate(delta = V5-V4) %>%
#shorten module names
mutate(module=gsub("module_P337_", "", module)) %>%
#wide format
dplyr::select(-V4,-V5) %>%
arrange(donorID, module) %>%
pivot_wider(names_from = module, values_from = delta)
plex.delta <- read_csv("data_raw/addtl.data/P337_BAL.multiplex.csv") %>%
rename(donorID=ptID, FGF2_V4=TGF2_V4) %>%
pivot_longer(-donorID) %>%
#Log10 transform
mutate(value = ifelse(value==0,0, log10(value))) %>%
separate(name, into=c("name","visit"), sep="_") %>%
pivot_wider(names_from = visit) %>%
mutate(delta = V5-V4) %>%
drop_na(delta) %>%
dplyr::select(-V4,-V5) %>%
arrange(name) %>%
mutate(name = paste(name, "multiplex", sep=".")) %>%
pivot_wider(values_from = delta)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## ptID = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
Check if cytokine genes are in expression modules as well.
| Module | Gene HGNC | Cytokine |
|---|---|---|
| EOS_01 | CCL13 | CCL13 |
| CCL2 | CCL2 | |
| CCL22 | CCL22 | |
| CCL26 | CCL26 | |
| CCL7 | CCL7 | |
| CXCL8 | IL8 | |
| CCL17 | TARC | |
| TGFA | TGFA | |
| EOS_02 | FLT3LG | FLT3L |
| IL10 | IL10 | |
| IL9 | IL9 | |
| VEGFA | VEGF | |
| EOS_04 | IL16 | IL16 |
| EOS_05 | IL1RN | IL1RA |
| EOS_06 | CCL8 | CCL8 |
| CXCL1 | CXCL1 | |
| IL13 | IL13 | |
| IL6 | IL6 | |
| CD40LG | sCD40L | |
| PMN_01 | CCL13 | CCL13 |
| CCL2 | CCL2 | |
| CCL7 | CCL7 | |
| CCL8 | CCL8 | |
| CXCL1 | CXCL1 | |
| IL10 | IL10 | |
| IL6 | IL6 | |
| CXCL8 | IL8 | |
| VEGFA | VEGF |
Donor MA1012 is removed due to movement issues in fMRI.
#Time point 1 = visit 4
neuro <- read_excel(sheet="T1",
"data_raw/addtl.data/extraced.clusters.Matt.Altman_wbaseline_psychdata.xlsx") %>%
#long format
pivot_longer(-idnum, names_to="neuro") %>%
#add visit variable
mutate(visit="V4")
#Time point 2 = visit 5
neuro <- read_excel(sheet="T2",
"data_raw/addtl.data/extraced.clusters.Matt.Altman_wbaseline_psychdata.xlsx") %>%
#long format
pivot_longer(-idnum, names_to="neuro") %>%
#add visit variable
mutate(visit="V5") %>%
#Combine with other visit
full_join(neuro) %>%
#Format idnum to match RNAseq data
mutate(idnum = paste("MA",idnum, sep="")) %>%
rename(donorID=idnum)
neuro.delta <- neuro %>%
filter(donorID != "MA1012" & neuro != "LSI" & neuro != "BDI") %>%
#Calculate delta
pivot_wider(names_from = visit) %>%
mutate(delta = V5-V4) %>%
#wide format
dplyr::select(-V4,-V5) %>%
pivot_wider(names_from = neuro, values_from = delta)
asthma.delta <- dat.BAL.abund.norm.voom$targets %>%
dplyr::select(donorID, FEV1.pctPP.preAlbuterol_V4:FeNO.PreBro_V5) %>%
distinct() %>%
#get visit from variable names
pivot_longer(-donorID) %>%
separate(name, into=c("asthma","visit"), sep="_") %>%
#Calculate delta
pivot_wider(names_from = visit) %>%
mutate(delta = V5-V4) %>%
#Wide format
dplyr::select(-V4,-V5) %>%
pivot_wider(names_from = asthma, values_from = delta)
Regression mode: Y matrix is deflated with respect to information extracted/modeled from local regression on X. Here, the goal is to predict Y from X (and vice versa).
Sparse: simultaneous variable selection in X and Y with LASSO penalization on each pair of loading vectors
http://mixomics.org/methods/spls/
Format data
X <- counts.mod.delta %>%
filter(donorID %in% neuro.delta$donorID) %>%
full_join(plex.delta) %>%
column_to_rownames("donorID")
## Joining, by = "donorID"
X <- X[complete.cases(X),]
Y <- neuro.delta %>%
filter(donorID %in% rownames(X)) %>%
column_to_rownames("donorID")
Perform PLS regression.
ncomp = 10
#PLS
pls <- pls(X, Y, ncomp = ncomp, mode = "regression")
#SPLS
spls <- spls(X, Y, ncomp = ncomp, mode = "regression")
Compute evaluation criteria for (S)PLS.
pls.Q2 <- perf(pls, validation = "Mfold", folds = 10, nrepeat = 100,
progressBar = FALSE)
spls.Q2 <- perf(spls, validation = "Mfold", folds = 10, nrepeat = 100,
progressBar = FALSE)
Q2 total is the sum of the quality of fit over all variables. In general, components should only continue to be added if Q2 < 0.0975 (red line). After that, there are diminishing returns to adding more components. Here, we would need to retain too many components by this general rule.
Thus, axis loadings will be assessed as well. Looking at the axis loadings (same for PLS and SPLS), component 1 explains the most variation (~40%) in X with further components contributing to a lesser extent (<10%). Y shows more even contributions from components 1-3 being > 25%.
Taken together, 2 X and 3 Y components will be used.
Compute mean absolute values (MAE) for variables on each component. (Only SPLS is assessed from this point).
#X variables
spls.MAE.X <- tune.spls(X, Y,
ncomp = ncompX, #Determined above
test.keepX = c(2:30, 40),
validation = "Mfold",
folds = 10, nrepeat = 100,
progressBar = FALSE,
measure = 'MAE')
#Y variables
spls.MAE.Y <- tune.spls(Y, X,
ncomp = ncompY, #Determined above
test.keepX = c(2:10),
validation = "Mfold",
folds = 10, nrepeat = 10,
progressBar = FALSE,
measure = 'MAE')
#Save
dir.create("results/PLS/", showWarnings = FALSE)
save(spls.MAE.X, spls.MAE.Y,
file="results/PLS/MAE.RData")
Based on minimum MAE, the number of variables to keep per component is
| Space | Component | No. to keep |
|---|---|---|
| X | 1 | 24 |
| 2 | 8 | |
| Y | 1 | 9 |
| 2 | 10 | |
| 3 | 8 |
Using the parameters determined above, run the final (S)PLS regression. 2 components will be used based on tuning X.
result.spls <- spls(X,Y,
keepX = spls.MAE.X$choice.keepX,
keepY = spls.MAE.Y$choice.keepX,
ncomp = ncompX, mode = "regression")
This results in the following variables across 2 components.
Correlations within SPLS
Format data
X <- counts.mod.delta %>%
filter(donorID %in% neuro.delta$donorID) %>%
column_to_rownames("donorID")
X <- X[complete.cases(X),]
Y <- neuro.delta %>%
filter(donorID %in% rownames(X)) %>%
column_to_rownames("donorID")
Perform PLS regression.
ncomp = 9
#PLS
pls <- pls(X, Y, ncomp = ncomp, mode = "regression")
#SPLS
spls <- spls(X, Y, ncomp = ncomp, mode = "regression")
Compute evaluation criteria for (S)PLS.
pls.Q2 <- perf(pls, validation = "Mfold", folds = 10, nrepeat = 100,
progressBar = FALSE)
spls.Q2 <- perf(spls, validation = "Mfold", folds = 10, nrepeat = 100,
progressBar = FALSE)
Q2 total is the sum of the quality of fit over all variables. In general, components should only continue to be added if Q2 < 0.0975 (red line). After that, there are diminishing returns to adding more components. Here, we would need to retain too many components by this general rule.
Thus, axis loadings will be assessed as well. Looking at the axis loadings (same for PLS and SPLS), component 1 explains the most variation (~40%) in X with further components contributing to a lesser extent (<10%). Y shows more even contributions from components 1-3 being > 25%.
Taken together, 2 X and 2 Y components will be used.
Compute mean absolute values (MAE) for variables on each component. (Only SPLS is assessed from this point).
#X variables
spls.MAE.X <- tune.spls(X, Y,
ncomp = ncompX, #Determined above
test.keepX = c(2:9),
validation = "Mfold",
folds = 10, nrepeat = 100,
progressBar = FALSE,
measure = 'MAE')
#Y variables
spls.MAE.Y <- tune.spls(Y, X,
ncomp = ncompY, #Determined above
test.keepX = c(2:10),
validation = "Mfold",
folds = 10, nrepeat = 10,
progressBar = FALSE,
measure = 'MAE')
#Save
save(spls.MAE.X, spls.MAE.Y,
file="results/PLS/MAE3.RData")
Based on minimum MAE, the number of variables to keep per component is
| Space | Component | No. to keep |
|---|---|---|
| X | 1 | 2 |
| 2 | 8 | |
| Y | 1 | 10 |
| 2 | 2 |
Using the parameters determined above, run the final (S)PLS regression. 2 components will be used based on tuning X.
result.spls <- spls(X,Y,
keepX = spls.MAE.X$choice.keepX,
keepY = spls.MAE.Y$choice.keepX,
ncomp = ncompX, mode = "regression")
This results in the following variables across 2 components.
Correlations within SPLS
Format data
X <- plex.delta %>%
filter(donorID %in% neuro.delta$donorID) %>%
column_to_rownames("donorID")
X <- X[complete.cases(X),]
Y <- neuro.delta %>%
filter(donorID %in% rownames(X)) %>%
column_to_rownames("donorID")
Perform PLS regression.
ncomp = 10
#PLS
pls <- pls(X, Y, ncomp = ncomp, mode = "regression")
#SPLS
spls <- spls(X, Y, ncomp = ncomp, mode = "regression")
Compute evaluation criteria for (S)PLS.
pls.Q2 <- perf(pls, validation = "Mfold", folds = 10, nrepeat = 100,
progressBar = FALSE)
spls.Q2 <- perf(spls, validation = "Mfold", folds = 10, nrepeat = 100,
progressBar = FALSE)
Q2 total is the sum of the quality of fit over all variables. In general, components should only continue to be added if Q2 < 0.0975 (red line). After that, there are diminishing returns to adding more components. Here, we would need to retain too many components by this general rule.
Thus, axis loadings will be assessed as well. Looking at the axis loadings (same for PLS and SPLS), component 1 explains the most variation (~40%) in X with further components contributing to a lesser extent (<10%). Y shows more even contributions from components 1-3 being > 25%.
Taken together, 2 X and 2 Y components will be used.
Compute mean absolute values (MAE) for variables on each component. (Only SPLS is assessed from this point).
#X variables
spls.MAE.X <- tune.spls(X, Y,
ncomp = ncompX, #Determined above
test.keepX = c(2:30,40),
validation = "Mfold",
folds = 10, nrepeat = 100,
progressBar = FALSE,
measure = 'MAE')
#Y variables
spls.MAE.Y <- tune.spls(Y, X,
ncomp = ncompY, #Determined above
test.keepX = c(2:10),
validation = "Mfold",
folds = 10, nrepeat = 10,
progressBar = FALSE,
measure = 'MAE')
#Save
save(spls.MAE.X, spls.MAE.Y,
file="results/PLS/MAE4.RData")
Based on minimum MAE, the number of variables to keep per component is
| Space | Component | No. to keep |
|---|---|---|
| X | 1 | 24 |
| 2 | 2 | |
| Y | 1 | 9 |
| 2 | 10 |
Using the parameters determined above, run the final (S)PLS regression. 2 components will be used based on tuning X.
result.spls <- spls(X,Y,
keepX = spls.MAE.X$choice.keepX,
keepY = spls.MAE.Y$choice.keepX,
ncomp = ncompX, mode = "regression")
This results in the following variables across 2 components.
Correlations within SPLS
Format data
X <- counts.mod.delta %>%
filter(donorID %in% asthma.delta$donorID) %>%
full_join(plex.delta) %>%
column_to_rownames("donorID")
## Joining, by = "donorID"
X <- X[complete.cases(X),]
Y <- asthma.delta %>%
filter(donorID %in% rownames(X)) %>%
column_to_rownames("donorID")
Perform PLS regression.
ncomp = 10
#PLS
pls <- pls(X, Y, ncomp = ncomp, mode = "regression")
#SPLS
spls <- spls(X, Y, ncomp = ncomp, mode = "regression")
Compute evaluation criteria for (S)PLS.
pls.Q2 <- perf(pls, validation = "Mfold", folds = 10, nrepeat = 100,
progressBar = FALSE)
spls.Q2 <- perf(spls, validation = "Mfold", folds = 10, nrepeat = 100,
progressBar = FALSE)
Q2 total is the sum of the quality of fit over all variables. In general, components should only continue to be added if Q2 < 0.0975 (red line). After that, there are diminishing returns to adding more components. Here, we would need to retain too many components by this general rule.
Thus, axis loadings will be assessed as well. Looking at the axis loadings (same for PLS and SPLS), component 1 explains the most variation (~45%) in X with further components contributing to a lesser extent (<10%). Y shows more even contributions from components 1-3 being > 20%.
Taken together, 2 X and 2 Y components will be used.
Compute mean absolute values (MAE) for variables on each component. (Only SPLS is assessed from this point).
#X variables
spls.MAE.X <- tune.spls(X, Y,
ncomp = ncompX, #Determined above
test.keepX = c(2:30, 40),
validation = "Mfold",
folds = 10, nrepeat = 100,
progressBar = FALSE,
measure = 'MAE')
#Y variables
spls.MAE.Y <- tune.spls(Y, X,
ncomp = ncompY, #Determined above
test.keepX = c(2:3),
validation = "Mfold",
folds = 10, nrepeat = 10,
progressBar = FALSE,
measure = 'MAE')
#Save
save(spls.MAE.X, spls.MAE.Y,
file="results/PLS/MAE2.RData")
Based on minimum MAE, the number of variables to keep per component is
| Space | Component | No. to keep |
|---|---|---|
| X | 1 | 6 |
| 2 | 40 | |
| Y | 1 | 3 |
| 2 | 2 |
Using the parameters determined above, run the final (S)PLS regression. 2 components will be used based on tuning X.
result.spls <- spls(X,Y,
keepX = spls.MAE.X$choice.keepX,
keepY = spls.MAE.Y$choice.keepX,
ncomp = ncompX, mode = "regression")
This results in the following variables across 2 components.
Correlations within SPLS
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] kableExtra_1.3.1 knitr_1.30 mixOmics_6.12.2 lattice_0.20-41
## [5] MASS_7.3-53 corrplot_0.84 limma_3.44.3 tidytext_0.3.0
## [9] cowplot_1.1.1 readxl_1.3.1 forcats_0.5.0 stringr_1.4.0
## [13] dplyr_1.0.3 purrr_0.3.4 readr_1.4.0 tidyr_1.1.2
## [17] tibble_3.0.5 ggplot2_3.3.3 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] httr_1.4.2 viridisLite_0.3.0 jsonlite_1.7.2 ellipse_0.4.2
## [5] modelr_0.1.8 assertthat_0.2.1 highr_0.8 selectr_0.4-2
## [9] cellranger_1.1.0 yaml_2.2.1 pillar_1.4.7 backports_1.2.1
## [13] glue_1.4.2 digest_0.6.27 RColorBrewer_1.1-2 rvest_0.3.6
## [17] colorspace_2.0-0 htmltools_0.5.1 Matrix_1.3-2 plyr_1.8.6
## [21] pkgconfig_2.0.3 broom_0.7.3 haven_2.3.1 corpcor_1.6.9
## [25] webshot_0.5.2 scales_1.1.1 RSpectra_0.16-0 farver_2.0.3
## [29] generics_0.1.0 ellipsis_0.3.1 withr_2.4.0 cli_2.2.0
## [33] magrittr_2.0.1 crayon_1.3.4 evaluate_0.14 tokenizers_0.2.1
## [37] janeaustenr_0.1.5 fs_1.5.0 fansi_0.4.2 SnowballC_0.7.0
## [41] xml2_1.3.2 tools_4.0.2 hms_1.0.0 lifecycle_0.2.0
## [45] matrixStats_0.57.0 munsell_0.5.0 reprex_0.3.0 compiler_4.0.2
## [49] rlang_0.4.10 grid_4.0.2 rstudioapi_0.13 igraph_1.2.6
## [53] labeling_0.4.2 rmarkdown_2.6 gtable_0.3.0 DBI_1.1.1
## [57] rARPACK_0.11-0 reshape2_1.4.4 R6_2.5.0 gridExtra_2.3
## [61] lubridate_1.7.9.2 stringi_1.5.3 parallel_4.0.2 Rcpp_1.0.6
## [65] vctrs_0.3.6 dbplyr_2.0.0 tidyselect_1.1.0 xfun_0.20